Web scraping and analysis¶

In [4]:
#Scraping data from Skytrax
In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
In [7]:
import pandas as pd
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 10
page_size = 100

reviews = []

# for i in range(1, pages + 1):
for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page
    response = requests.get(url)

    # Parse content
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(para.get_text())
    
    print(f"   ---> {len(reviews)} total reviews")
Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews
In [8]:
df = pd.DataFrame()
df["reviews"] = reviews
df.head()
Out[8]:
reviews
0 ✅ Trip Verified | Prior to boarding a gate a...
1 ✅ Trip Verified | I flew from Amsterdam to L...
2 ✅ Trip Verified | First the good news, the clu...
3 ✅ Trip Verified | I have never travelled wit...
4 ✅ Trip Verified | Terrible overall, medium ser...
In [10]:
# saving data into CSV
In [9]:
df.to_csv("data/BA_reviews.csv")

Data Cleaning¶

In [11]:
#imports

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

#regex
import re
In [14]:
import os
import pandas as pd

# Get the current working directory
cwd = os.getcwd()
print(f"Current working directory: {cwd}")  # Debug: Check where you are

# Define the correct path to your CSV file
file_path = "C:/Users/sangr/data/BA_reviews.csv"

# Read the CSV file into a DataFrame
df = pd.read_csv(file_path, index_col=0)

# Display the first few rows to verify
print(df.head())
Current working directory: C:\Users\sangr
                                             reviews
0  ✅ Trip Verified |   Prior to boarding a gate a...
1  ✅ Trip Verified |   I flew from Amsterdam to L...
2  ✅ Trip Verified | First the good news, the clu...
3  ✅ Trip Verified |   I have never travelled wit...
4  ✅ Trip Verified | Terrible overall, medium ser...
In [16]:
#  will also create a column which mentions if the user is verified or not.
In [17]:
df['verified'] = df.reviews.str.contains("Trip Verified")
In [18]:
df['verified']
Out[18]:
0       True
1       True
2       True
3       True
4       True
       ...  
995     True
996    False
997     True
998     True
999     True
Name: verified, Length: 1000, dtype: bool

Cleaning Reviews¶

In [20]:
import nltk
nltk.download('wordnet')  
nltk.download('stopwords')  
nltk.download('punkt')  
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sangr\AppData\Roaming\nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sangr\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sangr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Out[20]:
True
In [21]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
In [24]:
lemma = WordNetLemmatizer()
reviews_data = df.reviews.str.strip("✅ Trip Verified |")

#create an empty list to collect cleaned data corpus
corpus =[]

#loop through each review, remove punctuations, small case it, join it and add it to corpus
for rev in reviews_data:
    rev = re.sub('[^a-zA-Z]',' ', rev)
    rev = rev.lower()
    rev = rev.split()
    rev = [lemma.lemmatize(word) for word in rev if word not in set(stopwords.words("english"))]
    rev = " ".join(rev)
    corpus.append(rev)
In [25]:
# add the corpus to the original dataframe

df['corpus'] = corpus
In [26]:
df.head()
Out[26]:
reviews verified corpus
0 ✅ Trip Verified | Prior to boarding a gate a... True prior boarding gate agent seemed pick elderly ...
1 ✅ Trip Verified | I flew from Amsterdam to L... True flew amsterdam la vega layover heathrow novemb...
2 ✅ Trip Verified | First the good news, the clu... True first good news club suite huge improvement ol...
3 ✅ Trip Verified | I have never travelled wit... True never travelled british airway first time chos...
4 ✅ Trip Verified | Terrible overall, medium ser... True ble overall medium service flight delayed help...
In [27]:
#Cleaning/Fromat date
In [30]:
df.dtypes
Out[30]:
reviews     object
verified      bool
corpus      object
dtype: object
In [32]:
#Check for null Values
In [33]:
df.isnull().value_counts()
Out[33]:
reviews  verified  corpus
False    False     False     1000
Name: count, dtype: int64
In [35]:
df.shape
Out[35]:
(1000, 3)
In [36]:
#resetting the index
df.reset_index(drop=True)
Out[36]:
reviews verified corpus
0 ✅ Trip Verified | Prior to boarding a gate a... True prior boarding gate agent seemed pick elderly ...
1 ✅ Trip Verified | I flew from Amsterdam to L... True flew amsterdam la vega layover heathrow novemb...
2 ✅ Trip Verified | First the good news, the clu... True first good news club suite huge improvement ol...
3 ✅ Trip Verified | I have never travelled wit... True never travelled british airway first time chos...
4 ✅ Trip Verified | Terrible overall, medium ser... True ble overall medium service flight delayed help...
... ... ... ...
995 ✅ Trip Verified | I have to say travelling in ... True say travelling club europe waste money food be...
996 Not Verified | I had a stress free journey wi... False verified stress free journey yr old autistic s...
997 ✅ Trip Verified | Edinburgh to Kuala Lumpur v... True edinburgh kuala lumpur via london returned kl ...
998 ✅ Trip Verified | I was supposed to fly from ... True supposed fly london city amsterdam business cl...
999 ✅ Trip Verified | I purchased a ticket for Du... True purchased ticket dublin mauritius british airw...

1000 rows × 3 columns

In [37]:
# export the cleaned data

df.to_csv(cwd + "/cleaned-BA-reviews.csv")

Cleaning is done¶

In [ ]:
 

EDA¶

In [38]:
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import datetime as dt

from wordcloud import WordCloud, STOPWORDS
In [39]:
# create the dataframe
cwd = os.getcwd()
df = pd.read_csv(cwd+"/cleaned-BA-reviews.csv", index_col=0)

#let's also check the index are in order
df = df.reset_index(drop=True)
In [41]:
df.head()
Out[41]:
reviews verified corpus
0 ✅ Trip Verified | Prior to boarding a gate a... True prior boarding gate agent seemed pick elderly ...
1 ✅ Trip Verified | I flew from Amsterdam to L... True flew amsterdam la vega layover heathrow novemb...
2 ✅ Trip Verified | First the good news, the clu... True first good news club suite huge improvement ol...
3 ✅ Trip Verified | I have never travelled wit... True never travelled british airway first time chos...
4 ✅ Trip Verified | Terrible overall, medium ser... True ble overall medium service flight delayed help...
In [42]:
#What is the average overall rating given for British Airways?
In [46]:
# Load your file (replace 'your_file.csv' with your actual file name)
data = pd.read_csv("C:/Users/sangr/Downloads/cleaned-BA-reviews.csv")
In [48]:
# 1. Basic Overview
print("Dataset Shape:", data.shape)  # Number of rows and columns
print("\nDataset Info:")
print(data.info())  # Data types and missing values
print("\nFirst 5 Rows:")
print(data.head())  # Preview the data
Dataset Shape: (3411, 7)

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3411 entries, 0 to 3410
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  3411 non-null   int64 
 1   reviews     3411 non-null   object
 2   stars       3411 non-null   int64 
 3   date        3411 non-null   object
 4   country     3411 non-null   object
 5   verified    3411 non-null   bool  
 6   corpus      3411 non-null   object
dtypes: bool(1), int64(2), object(4)
memory usage: 163.4+ KB
None

First 5 Rows:
   Unnamed: 0                                            reviews  stars  \
0           0  Not Verified | Worst experience ever. Outbound...      5   
1           1  ✅ Trip Verified |  Check in was a shambles at ...      1   
2           2  ✅ Trip Verified | Beyond disgusted with the fa...      5   
3           3  ✅ Trip Verified | On July 19th 2022 I had subm...      1   
4           4  ✅ Trip Verified |  I booked the flight on Oct ...      1   

         date               country  verified  \
0  2022-11-07                 Italy     False   
1  2022-11-07              Malaysia      True   
2  2022-11-05  United Arab Emirates      True   
3  2022-10-31         United States      True   
4  2022-10-31         United States      True   

                                              corpus  
0  verified worst experience ever outbound flight...  
1  check shamble bwi counter open full flight bag...  
2  beyond disgusted fact baggage yet delivered we...  
3  july th submitted complaint form regard fact b...  
4  booked flight oct cancel flight day learning g...  
In [49]:
# 2. Summary Statistics
print("\nSummary Statistics:")
print(data.describe())  # Mean, median, std, etc. for numeric columns
Summary Statistics:
        Unnamed: 0        stars
count  3411.000000  3411.000000
mean   1705.766051     4.841102
std     985.966935     3.144230
min       0.000000     1.000000
25%     852.500000     2.000000
50%    1705.000000     4.000000
75%    2557.500000     8.000000
max    3417.000000    10.000000
In [51]:
# 3. Check for Missing Values
print("\nMissing Values:")
print(data.isnull().sum())
Missing Values:
Unnamed: 0    0
reviews       0
stars         0
date          0
country       0
verified      0
corpus        0
dtype: int64
In [52]:
# 4. Visualizations
# Histogram for numeric columns
data.hist(bins=20, figsize=(10, 8))
plt.tight_layout()
plt.show()
In [57]:
# Correlation heatmap (for numeric data)
numeric_data = data.select_dtypes(include=['float64', 'int64'])  # Only numeric columns
if not numeric_data.empty:
    plt.figure(figsize=(8, 6))
    sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap')
    plt.show()
else:
    print("No numeric columns available for correlation heatmap.")
In [59]:
print(df.columns)
Index(['reviews', 'verified', 'corpus'], dtype='object')
In [60]:
df = pd.read_csv("C:/Users/sangr/Downloads/cleaned-BA-reviews.csv", encoding="utf-8", low_memory=False)
print(df.columns)
Index(['Unnamed: 0', 'reviews', 'stars', 'date', 'country', 'verified',
       'corpus'],
      dtype='object')
In [63]:
# Ensure column names are stripped of any spaces
df.columns = df.columns.str.strip()

# Convert 'stars' column to numeric in case of any issues
df['stars'] = pd.to_numeric(df['stars'], errors='coerce')

# Calculate the average rating
average_rating = df['stars'].mean()

print("Average Rating:", average_rating)
Average Rating: 4.841102316036353

What is the total counts for each ratings?¶

In [65]:
df.stars.value_counts().plot(kind="bar")
plt.xlabel("Ratings")
plt.ylabel("Total Number of reviews with that rating")
plt.suptitle("Counts for each ratings")
Out[65]:
Text(0.5, 0.98, 'Counts for each ratings')
In [67]:
#resetting index as we do not want to confuse between the index and the rating values
df_ratings = df_ratings.reset_index()
In [68]:
# renaming columns
df_ratings.rename(columns={'index':'Stars', 'stars':'total_counts'}, inplace=True)
In [69]:
df_ratings
Out[69]:
total_counts count
0 1 735
1 2 382
2 3 379
3 8 349
4 10 306
5 7 299
6 9 293
7 5 259
8 4 227
9 6 182
In [81]:
df_ratings.columns = df_ratings.columns.str.strip().str.lower()  # Standardize column names
print(df_ratings.columns)
Index(['stars', 'total_counts'], dtype='object')
In [82]:
clrs = ['Red' if (x ==  max(df_ratings.total_counts)) else 'grey' for x in df_ratings.total_counts ]
ax = sns.barplot(x=df_ratings.stars, y=df_ratings.total_counts, data=df_ratings, errwidth=0,
                palette=clrs)


ax.bar_label(ax.containers[0])
    
ax.set_xlabel("Ratings")
ax.set_ylabel("Total Number of reviews with that rating")
ax.set_title("Counts for each ratings")
Out[82]:
Text(0.5, 1.0, 'Counts for each ratings')
In [83]:
# Unique countries BA recieved the reviews from

print(f"{len(df.country.unique())} unique countries")
69 unique countries
In [84]:
# Which country most review comes from?
In [87]:
df_country_review = pd.DataFrame(df.country.value_counts().head()).reset_index()
In [88]:
df_country_review.rename(columns={'index':'country','country':'total_reviews'}, inplace=True)
In [91]:
print(df.columns)
Index(['Unnamed: 0', 'reviews', 'stars', 'date', 'country', 'verified',
       'corpus'],
      dtype='object')
In [93]:
df_country_review = df.groupby("country").size().reset_index(name="total_reviews")
In [94]:
df.columns = df.columns.str.strip().str.lower()
print(df.columns)  # Check again
Index(['unnamed: 0', 'reviews', 'stars', 'date', 'country', 'verified',
       'corpus'],
      dtype='object')
In [105]:
import pandas as pd
import matplotlib.pyplot as plt

# Check if 'country' exists
if 'country' in df.columns:
    # Count number of reviews per country
    df_country_review = df.groupby("country").size().reset_index(name="total_reviews")
    
    # Get top 5 countries with most reviews
    top_countries = df_country_review.sort_values(by="total_reviews", ascending=False).head(5)

    # Define vibrant colors
    vibrant_colors = ['#FF355E', '#FD5B78', '#FF6037', '#FFCC33', '#66FF66']  # Cherry Red, Coral, Orange, Yellow, Lime Green
    
    # Plot the pie chart
    plt.figure(figsize=(8, 6))
    plt.pie(top_countries["total_reviews"], 
            labels=top_countries["country"], 
            autopct='%1.1f%%', 
            colors=vibrant_colors, 
            startangle=140, 
            wedgeprops={'edgecolor': 'white', 'linewidth': 2},  
            textprops={'fontsize': 12, 'color': 'black'})      
    plt.title("Top 5 Countries by Number of Reviews", 
              fontsize=16, 
              color='#FF355E', 
              pad=20) 
    
    # Add a slight shadow effect
    plt.pie(top_countries["total_reviews"], 
            labels=None,  
            colors=vibrant_colors, 
            startangle=140, 
            radius=0.85,  
            wedgeprops={'edgecolor': 'gray', 'linewidth': 1, 'alpha': 0.3})

    plt.show()

else:
    print("❌ 'country' column not found in the dataset!")
In [106]:
# Which country provided on average highest ratings?
In [111]:
import pandas as pd
import matplotlib.pyplot as plt

# Check if 'country' and 'stars' exist
if 'country' in df.columns and 'stars' in df.columns:
    # Ensure 'stars' is numeric (convert if needed)
    df['stars'] = pd.to_numeric(df['stars'], errors='coerce')  # Converts non-numeric to NaN
    
    # Group by country and calculate mean stars, then sort and reset index
    df_country_rating = pd.DataFrame(
        df.groupby('country')['stars'].mean().sort_values(ascending=False)
    ).reset_index()

    # Get top 10 countries by average rating (changed from 5 to 10)
    top_countries = df_country_rating.head(10)

    # Define vibrant colors (extended to 10 colors)
    vibrant_colors = ['#FF355E', '#FD5B78', '#FF6037', '#FFCC33', '#66FF66', 
                      '#00CCCC', '#FF00FF', '#FF6F61', '#6B5B95', '#88B04B']  # Added Cyan, Magenta, Coral, Purple, Olive Green
    
    # Plot the bar chart
    plt.figure(figsize=(12, 6))  # Slightly wider figure to accommodate 10 bars
    bars = plt.bar(top_countries['country'], 
                   top_countries['stars'], 
                   color=vibrant_colors, 
                   edgecolor='white',  # White borders for contrast
                   linewidth=2)        # Border thickness

    # Customize the chart
    plt.title("Top 10 Countries by Average Rating", 
              fontsize=16, 
              color='#FF355E', 
              pad=20)  # Updated title
    
    plt.xlabel("Country", fontsize=12, color='black')
    plt.ylabel("Average Rating", fontsize=12, color='black')
    
    # Add value labels on top of each bar
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval + 0.05, 
                 f'{yval:.1f}',  # Display value with 1 decimal
                 ha='center', va='bottom', fontsize=10, color='black')

    # Adjust layout and grid
    plt.grid(axis='y', linestyle='--', alpha=0.7)  # Light grid on y-axis
    plt.xticks(rotation=45, ha='right')  # Rotate country names for better fit
    
    plt.tight_layout()
    plt.show()

else:
    print("❌ Required columns ('country' or 'stars') not found in the dataset!")

Time Series Analysis

In [112]:
#convert the date datatype to datetime

df.date = pd.to_datetime(df.date)
In [113]:
fig = px.line(df, x='date', y="stars")
fig.update_xaxes(rangeslider_visible=True)
fig.show()

It can be seen that between 2020 to 2021 there has been a decline in reviews due to Covid Pandemic travel restrictions. Not much can be inferred at this point as the dates we have are the when the customers posted their reviews which does not take into account the actual flight date. Hence no particular significan trend is visible from the plot.¶

In [115]:
import nltk
from nltk.corpus import stopwords
# Start with one review:
reviews = " ".join(df.corpus)
plt.figure(figsize=(10,10))

stopwords = set(stopwords.words('english'))

# Create and generate a word cloud image:
wordcloud = WordCloud(height=600,width=600,max_font_size=100, max_words=500, stopwords=stopwords).generate(reviews)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

There are many words that does not set the idea of whether the review is positive or negative. For example words like "passenger", "flight", etc. does not add conlcusive value hence we can include them in stopwords list.¶

In [117]:
import nltk
from nltk.corpus import stopwords
reviews = " ".join(df.corpus)
plt.figure(figsize=(20,10))

stopwords = set(stopwords.words('english'))
stopwords.update(["ba","flight", "british","airway", "airline","plane", "told","also","passenger" \
                 "london", "heathrow", "aircraft", "could","even", "would"])
# Create and generate a word cloud image:
wordcloud = WordCloud(height=500,width=500,max_font_size=100, max_words=300, stopwords=stopwords).generate(reviews)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

Word Frequency¶

In [118]:
from nltk import ngrams
from nltk.probability import FreqDist

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer

#split the text of all reviews into a list of words
words = reviews.split(" ")

#remove certain words that will not be used to determine the positive or negative sentiment
stopwords = text.ENGLISH_STOP_WORDS.union(['flight', 'ba', "passenger","u", "london","airway","british","airline",\
                                           "heathrow","plane","lhr","review"])


new_words = [word for word in words if word not in stopwords]

nlp_words=FreqDist(new_words).most_common(20)

#create a dataframe of these word and its frequencies
all_fdist = pd.Series(dict(nlp_words))
In [119]:
## Setting figure, ax into variables
fig, ax = plt.subplots(figsize=(15,8))

## Seaborn plotting using Pandas attributes + xtick rotation for ease of viewing
all_plot = sns.barplot(x=all_fdist.index, y=all_fdist.values, ax=ax)
all_plot.bar_label(all_plot.containers[0])
plt.xticks(rotation=30)
Out[119]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19]),
 [Text(0, 0, 'seat'),
  Text(1, 0, 'service'),
  Text(2, 0, 'food'),
  Text(3, 0, 'time'),
  Text(4, 0, 'crew'),
  Text(5, 0, 'cabin'),
  Text(6, 0, 'good'),
  Text(7, 0, 'class'),
  Text(8, 0, 'hour'),
  Text(9, 0, 'business'),
  Text(10, 0, 'staff'),
  Text(11, 0, 'economy'),
  Text(12, 0, 'check'),
  Text(13, 0, 'drink'),
  Text(14, 0, 'meal'),
  Text(15, 0, 'return'),
  Text(16, 0, 'lounge'),
  Text(17, 0, 'club'),
  Text(18, 0, 'boarding'),
  Text(19, 0, 'experience')])

This gives us a glimpse of what customers are really talking about here. We see that Seat is most talked about the airline followed by "Service" and "food" which are all very important to customers in terms of service. However, we still do not know is how they are expressing about each of this service. To bring some significane to these terms we will use ngram plots to see if they are bad or good in experience.

Word Frequency with N-gram¶

In [120]:
## Imports
import nltk.collocations as collocations
from nltk import FreqDist, bigrams

reviews = " ".join(df.corpus)

#split the text of all reviews into a list of words
words = reviews.split(" ")

new_words = [word for word in words if word not in stopwords]

def get_freq_dist(new_words,number_of_ngrams ):
    from nltk import ngrams
    
    ## Generate bigrams
    ngrams = ngrams(new_words, number_of_ngrams)

    ## Creating FreqDist
    ngram_fd = FreqDist(ngrams).most_common(40)

    ## Sort values by highest frequency
    ngram_sorted = {k:v for k,v in sorted(ngram_fd, key=lambda item:item[1])}

    ## Join bigram tokens with '_' + maintain sorting
    ngram_joined = {'_'.join(k):v for k,v in sorted(ngram_fd, key=lambda item:item[1])}

    ## Convert to Pandas series for easy plotting
    ngram_freqdist = pd.Series(ngram_joined)
    plt.figure(figsize=(10,10))
    ax = ngram_freqdist.plot(kind="barh")
    
    return ax


get_freq_dist(new_words,4)
Out[120]:
<Axes: >

We can see that there are very common positive terms regarding cabin crew. For example, cabin_crew_friendly_helpful, cabin_crew_friendly_attentive, cabin_crew_friendly_efficient, etc. So certainly customers are providing good reviews about cabin crew staff of British Airways.

However, there is one another approach that we can try to find the word frequencies which will give us better idea. We will group the reviews based on ratings. Say, we assume ratigs 1-3 are bad reviews, 4-6 are average/good experience and 7-10 indicates a great experience.

In [121]:
ratings_1_3 = df[df.stars.isin([1,2,3])]
ratings_4_6 = df[df.stars.isin([4,5,6])]
ratings_7_10 = df[df.stars.isin([7,8,9,10])]
In [122]:
reviews_1_3 = " ".join(ratings_1_3.corpus)
reviews_4_6 = " ".join(ratings_4_6.corpus)
reviews_7_10 = " ".join(ratings_7_10.corpus)

#split the text of all reviews into a list of words
words_1_3 = reviews_1_3.split(" ")
words_4_6 = reviews_4_6.split(" ")
words_7_10 = reviews_7_10.split(" ")


new_words_7_10 = [word for word in words_7_10 if word not in stopwords]

get_freq_dist(new_words_7_10,4)
Out[122]:
<Axes: >
In [123]:
new_words = [word for word in words_4_6 if word not in stopwords]

get_freq_dist(new_words,4)
Out[123]:
<Axes: >
In [124]:
new_words = [word for word in words_1_3 if word not in stopwords]

get_freq_dist(new_words,4)
Out[124]:
<Axes: >

Negative or Positive TEXT¶

In [126]:
!pip install textblob
Collecting textblob
  Obtaining dependency information for textblob from https://files.pythonhosted.org/packages/1e/d6/40aa5aead775582ea0cf35870e5a3f16fab4b967f1ad2debe675f673f923/textblob-0.19.0-py3-none-any.whl.metadata
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Collecting nltk>=3.9 (from textblob)
  Obtaining dependency information for nltk>=3.9 from https://files.pythonhosted.org/packages/4d/66/7d9e26593edda06e8cb531874633f7c2372279c3b0f46235539fe546df8b/nltk-3.9.1-py3-none-any.whl.metadata
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Requirement already satisfied: click in c:\users\sangr\anaconda3\lib\site-packages (from nltk>=3.9->textblob) (8.0.4)
Requirement already satisfied: joblib in c:\users\sangr\anaconda3\lib\site-packages (from nltk>=3.9->textblob) (1.2.0)
Requirement already satisfied: regex>=2021.8.3 in c:\users\sangr\anaconda3\lib\site-packages (from nltk>=3.9->textblob) (2022.7.9)
Requirement already satisfied: tqdm in c:\users\sangr\anaconda3\lib\site-packages (from nltk>=3.9->textblob) (4.65.0)
Requirement already satisfied: colorama in c:\users\sangr\anaconda3\lib\site-packages (from click->nltk>=3.9->textblob) (0.4.6)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
    --------------------------------------- 10.2/624.3 kB ? eta -:--:--
   --- ----------------------------------- 61.4/624.3 kB 812.7 kB/s eta 0:00:01
   ---------------- ----------------------- 256.0/624.3 kB 2.3 MB/s eta 0:00:01
   ---------------------------------------  614.4/624.3 kB 4.3 MB/s eta 0:00:01
   ---------------------------------------- 624.3/624.3 kB 3.6 MB/s eta 0:00:00
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------------------- ------------- 1.0/1.5 MB 31.7 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 19.2 MB/s eta 0:00:00
Installing collected packages: nltk, textblob
  Attempting uninstall: nltk
    Found existing installation: nltk 3.8.1
    Uninstalling nltk-3.8.1:
      Successfully uninstalled nltk-3.8.1
Successfully installed nltk-3.9.1 textblob-0.19.0
In [127]:
%%capture
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

#set a column Polarity with all 0 values initially
df['polarity'] = 0

for i in range(len(df.corpus)):
    sent= TextBlob(df.corpus[i])
    polarity  = sent.sentiment.polarity
    subjectivity  = sent.sentiment.subjectivity
    df['polarity'][i] = polarity
In [128]:
# let's see how many texts are with positive comments

print(f"{df[(df['polarity'] >-0.2) & (df['polarity'] <0.2)].shape[0]} number of reviews between -0.2 and 0.2 polarity score")

print(f"{df[(df['polarity'] >-0.1) & (df['polarity'] <0.1)].shape[0]} number of reviews between -0.1 and 0.1 polarity score")
2286 number of reviews between -0.2 and 0.2 polarity score
1319 number of reviews between -0.1 and 0.1 polarity score

Polarity score is given between -1 to 1 and more close the value to -1, it indicates negative review and vice versa is true for positive value. If we consider a threshold where any review with polarity greater than 0.2 is positive and less than -0.2 is negative, we are left with 2286 reviews that lies in the neutral zone. To further narrow down this number of neutral reviews, let's take the threshold of 0.1.

We will try another method of labelling the reveiws as positives or negatives. In this we will use VADER algorihtm by nltk library.

In [129]:
%%capture
import nltk

from nltk.sentiment.vader import SentimentIntensityAnalyzer 
vds = SentimentIntensityAnalyzer()
# text = 'This is an extremely entertaining movie'

#set a column Polarity with all 0 values initially
df['label'] = 0
    
for i in range(len(df.corpus)):
    
    score = vds.polarity_scores(df.corpus[i])['compound']
    #print(score)
    if score > 0.2:
        df['label'][i] = 1
        #print("1st")
    elif score < 0:
        df['label'][i] = -1
        #print("2nd")
    else:
        df['label'][i] = 0
In [130]:
df.label.value_counts()
Out[130]:
label
 1    2245
-1    1049
 0     117
Name: count, dtype: int64

Topic Modeling with LDA¶

In [132]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
features = vectorizer.get_feature_names_out()
In [134]:
%%capture
from sklearn.feature_extraction.text import CountVectorizer

#create an object of count vectorizer
vect = CountVectorizer()

#apply transformation
tf = vect.fit_transform(df.corpus).toarray()

# get the feature names with the updated method
tf_feature_names = vect.get_feature_names_out()
In [135]:
from sklearn.decomposition import LatentDirichletAllocation

#declare the number of topics
number_of_topics = 8

model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

#fit the term frequency data to the model
model.fit(tf)

#create empty dictionary to store key value pair of topic number and its weights
topic_dict = {}

#loop through model components 
for topic_idx, topic in enumerate(model.components_):
    topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(tf_feature_names[i])
            for i in topic.argsort()[:-10 - 1:-1]]
    topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
            for i in topic.argsort()[:-10 - 1:-1]]
    
df_topic =pd.DataFrame(topic_dict)
In [136]:
df_topic
Out[136]:
Topic 0 words Topic 0 weights Topic 1 words Topic 1 weights Topic 2 words Topic 2 weights Topic 3 words Topic 3 weights Topic 4 words Topic 4 weights Topic 5 words Topic 5 weights Topic 6 words Topic 6 weights Topic 7 words Topic 7 weights
0 seat 1174.6 flight 1320.1 flight 1711.5 class 220.7 seat 1332.1 british 604.6 flight 1602.2 ba 554.5
1 class 638.1 crew 731.1 hour 671.9 seat 182.4 ba 1065.7 flight 582.0 ba 1380.5 flight 541.8
2 business 612.4 time 616.2 london 594.6 business 159.8 good 1051.3 airway 582.0 seat 795.9 customer 520.8
3 flight 357.7 ba 569.8 ba 471.5 flight 63.6 flight 1030.1 food 567.3 airline 490.0 service 376.8
4 airway 299.6 cabin 551.9 bag 459.9 ba 59.6 food 824.4 economy 533.0 staff 480.8 british 288.7
5 british 290.9 service 494.5 heathrow 450.0 passenger 42.3 crew 814.0 service 497.2 one 390.6 airway 285.8
6 ba 284.0 drink 437.4 time 436.4 facing 40.5 cabin 685.0 seat 462.8 service 372.1 refund 232.8
7 would 267.9 good 361.7 airway 406.9 one 40.5 service 585.5 london 425.4 hour 328.2 call 228.8
8 economy 264.7 lounge 350.4 british 404.7 first 32.7 lhr 565.6 airline 404.3 food 316.7 day 219.5
9 get 250.1 food 336.5 check 344.6 service 32.6 club 563.9 meal 381.2 london 299.5 airline 212.0

Topic modeling with NMF¶

In [137]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=2, init='random', random_state=0)
nmf.fit_transform(tf)
Out[137]:
array([[0.        , 0.07167139],
       [0.0715827 , 0.05867791],
       [0.02121372, 0.0350061 ],
       ...,
       [0.11298302, 0.15944847],
       [0.02491683, 0.03516653],
       [0.09953257, 0.        ]])
In [138]:
topic_dict = {}

#loop through model components 
for topic_idx, topic in enumerate(nmf.components_):
    topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(tf_feature_names[i])
            for i in topic.argsort()[:-10 - 1:-1]]
    topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
            for i in topic.argsort()[:-10 - 1:-1]]
    
df_topic =pd.DataFrame(topic_dict)
In [139]:
df_topic
Out[139]:
Topic 0 words Topic 0 weights Topic 1 words Topic 1 weights
0 seat 24.3 flight 22.3
1 ba 12.5 ba 7.6
2 class 10.3 hour 4.9
3 business 8.0 time 4.4
4 food 7.8 london 4.0
5 cabin 7.5 service 3.8
6 service 7.4 airway 3.5
7 good 6.6 british 3.5
8 crew 6.5 would 3.1
9 economy 6.4 staff 2.8
In [ ]: